{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['mnli.tsv.parse',\n",
       " 'dumping-iium.tsv.parse',\n",
       " 'dumping-parliament.tsv.parse',\n",
       " 'qa.tsv.parse',\n",
       " 'chatbot.tsv.parse',\n",
       " 'dumping-wiki.tsv.parse',\n",
       " 'dumping-news.tsv.parse',\n",
       " 'dumping-pdf.tsv.parse',\n",
       " 'summary.tsv.parse',\n",
       " 'quora.tsv.parse',\n",
       " 'snli.tsv.parse',\n",
       " 'dumping-watpadd.tsv.parse',\n",
       " 'stemming.tsv.parse',\n",
       " 'news-title.tsv.parse',\n",
       " 'synonym.tsv.parse']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "files = glob('*.parse')\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "from tqdm import tqdm\n",
    "import collections\n",
    "import tensorflow as tf\n",
    "maxlen = 1024\n",
    "\n",
    "def create_int_feature(values):\n",
    "    feature = tf.train.Feature(\n",
    "        int64_list = tf.train.Int64List(value = list(values))\n",
    "    )\n",
    "    return feature\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_inputs(x, filename):\n",
    "    input_ids, input_masks, segment_ids, ys = [], [], [], []\n",
    "    for i in tqdm(range(len(x))):\n",
    "        tokens = x[i][0]\n",
    "        input_id = tokens\n",
    "        \n",
    "        segment_id = [0] * len(tokens)\n",
    "        input_mask = [1] * len(input_id)\n",
    "        \n",
    "        input_id = input_id + [0] * (maxlen - len(input_id))\n",
    "        segment_id = segment_id + [0] * (maxlen - len(segment_id))\n",
    "        input_mask = input_mask + [0] * (maxlen - len(input_mask))\n",
    "\n",
    "        input_ids.append(input_id)\n",
    "        input_masks.append(input_mask)\n",
    "        segment_ids.append(segment_id)\n",
    "        \n",
    "        r = x[i][1]\n",
    "        r = r + [0] * (maxlen - len(r))\n",
    "        ys.append(r)\n",
    "        \n",
    "    r = tf.python_io.TFRecordWriter(f'{filename}.tfrecord')\n",
    "    for i in tqdm(range(len(ys))):\n",
    "        features = collections.OrderedDict()\n",
    "        features['input_ids'] = create_int_feature(input_ids[i])\n",
    "        features['input_mask'] = create_int_feature(input_masks[i])\n",
    "        features['segment_ids'] = create_int_feature(segment_ids[i])\n",
    "        features['y'] = create_int_feature(ys[i])\n",
    "        tf_example = tf.train.Example(\n",
    "            features = tf.train.Features(feature = features)\n",
    "        )\n",
    "        r.write(tf_example.SerializeToString())\n",
    "    r.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mnli.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 387577/387577 [00:48<00:00, 7924.53it/s] \n",
      "100%|██████████| 387577/387577 [05:47<00:00, 1116.93it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-iium.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 19477/19477 [00:01<00:00, 14919.61it/s]\n",
      "100%|██████████| 19477/19477 [00:17<00:00, 1090.20it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-parliament.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 12812/12812 [00:00<00:00, 13597.24it/s]\n",
      "100%|██████████| 12812/12812 [00:11<00:00, 1084.96it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "qa.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 98619/98619 [00:10<00:00, 9391.01it/s] \n",
      "100%|██████████| 98619/98619 [01:31<00:00, 1077.99it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chatbot.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 325598/325598 [00:37<00:00, 8583.86it/s] \n",
      "100%|██████████| 325598/325598 [04:57<00:00, 1093.41it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-wiki.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 45764/45764 [00:04<00:00, 9956.19it/s] \n",
      "100%|██████████| 45764/45764 [00:42<00:00, 1084.32it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-news.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 48592/48592 [00:06<00:00, 7894.79it/s] \n",
      "100%|██████████| 48592/48592 [00:43<00:00, 1118.30it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-pdf.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10101/10101 [00:00<00:00, 12582.01it/s]\n",
      "100%|██████████| 10101/10101 [00:09<00:00, 1103.25it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "summary.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 107472/107472 [00:11<00:00, 9000.12it/s] \n",
      "100%|██████████| 107472/107472 [01:37<00:00, 1098.24it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "quora.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 403831/403831 [00:52<00:00, 7657.49it/s] \n",
      "100%|██████████| 403831/403831 [06:05<00:00, 1104.31it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "snli.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 380288/380288 [00:48<00:00, 7761.48it/s] \n",
      "100%|██████████| 380288/380288 [05:44<00:00, 1104.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-watpadd.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24245/24245 [00:01<00:00, 14948.11it/s]\n",
      "100%|██████████| 24245/24245 [00:21<00:00, 1108.62it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stemming.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 200000/200000 [00:23<00:00, 8423.83it/s] \n",
      "100%|██████████| 200000/200000 [02:59<00:00, 1111.57it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "news-title.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 120410/120410 [00:15<00:00, 7990.70it/s] \n",
      "100%|██████████| 120410/120410 [01:47<00:00, 1117.19it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "synonym.tsv.parse\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 150000/150000 [00:17<00:00, 8791.39it/s] \n",
      "100%|██████████| 150000/150000 [02:17<00:00, 1091.58it/s]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "for file in files:\n",
    "    print(file)\n",
    "\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "    \n",
    "    get_inputs(data, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dumping-parliament.tsv.parse.tfrecord',\n",
       " 'mnli.tsv.parse.tfrecord',\n",
       " 'dumping-iium.tsv.parse.tfrecord',\n",
       " 'qa.tsv.parse.tfrecord',\n",
       " 'stemming.tsv.parse.tfrecord',\n",
       " 'dumping-pdf.tsv.parse.tfrecord',\n",
       " 'chatbot.tsv.parse.tfrecord',\n",
       " 'synonym.tsv.parse.tfrecord',\n",
       " 'dumping-watpadd.tsv.parse.tfrecord',\n",
       " 'news-title.tsv.parse.tfrecord',\n",
       " 'dumping-wiki.tsv.parse.tfrecord',\n",
       " 'quora.tsv.parse.tfrecord',\n",
       " 'summary.tsv.parse.tfrecord',\n",
       " 'dumping-news.tsv.parse.tfrecord',\n",
       " 'snli.tsv.parse.tfrecord']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files = glob('*.tfrecord')\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-storage.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from google.cloud import storage\n",
    "client = storage.Client()\n",
    "bucket = client.bucket('mesolitica-general')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dumping-parliament.tsv.parse.tfrecord\n",
      "mnli.tsv.parse.tfrecord\n",
      "dumping-iium.tsv.parse.tfrecord\n",
      "qa.tsv.parse.tfrecord\n",
      "stemming.tsv.parse.tfrecord\n",
      "dumping-pdf.tsv.parse.tfrecord\n",
      "chatbot.tsv.parse.tfrecord\n",
      "synonym.tsv.parse.tfrecord\n",
      "dumping-watpadd.tsv.parse.tfrecord\n",
      "news-title.tsv.parse.tfrecord\n",
      "dumping-wiki.tsv.parse.tfrecord\n",
      "quora.tsv.parse.tfrecord\n",
      "summary.tsv.parse.tfrecord\n",
      "dumping-news.tsv.parse.tfrecord\n",
      "snli.tsv.parse.tfrecord\n"
     ]
    }
   ],
   "source": [
    "for file in files:\n",
    "    print(file)\n",
    "    blob = bucket.blob(f'b2b-data/{file}')\n",
    "    blob.upload_from_filename(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 freeze"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
